#! python3
# phageDisplayUbvTrim.py - Analyse paired sequencing and ELISA results for UbVs by showing only non-conserved amino acid residues and highlighting regions that were targeted for diversification.

####################################
#    Preamble
####################################

# Usage notes:
# * This code is dependent on the style of the worksheet used as the sequencing/ELISA data source. This will be entirely based upon the output from the "phageDisplayElisaAnalysis.py" code.
# * Any assumptions that were made from previous code will be retained, i.e. if the data source is the output from "phageDisplaySeqAnalysis.py" then all alignments will exclude sequences that weren't full length and those that have premature stop codons.
# * Only amino acid sequences are trimmed, nucleotide sequences are excluded from this analysis.
# * This code will assume that there is an extra amino acid residue, K, leftover from the FLAG tag when using the output from the "phageDisplaySeqAnalysis.py" code. Without this, it will trim incorrectly.

# Compatibility notes:
# * Advised to use Biopython 1.77 and no later version. If using a more current version, change alignment codes from "alignment.format(format_spec)" to "format(alignment, format_spec)".
# * If using Spyder as your IDE, use a version that isn't version 5. This version for some reason has conflicts with the xlsxwriter package and won't get past importing modules.
# * This code is confirmed to work with python 3.8.8. Later versions may work but have not been verified.
# * Confirmed to work in Windows, unconfirmed in Macs and Linux but should work in theory (may need to change lines regarding path names so that the format matches the OS, currently these are optimised for Windows' format).

# To do:
# * Add custom warnings for common errors that break the script and give suggestions as to what went wrong.
# * Make code to read sequences from excel file without need for fasta file.

####################################
#    Modules
####################################

import os, re, logging, xlsxwriter, pandas
from collections import Counter, OrderedDict

####################################
#    Functions
####################################

# Compare components (x and y) of two strings (a and b, respectively) and return a string of the differences (diff) found in string b.
def compare(a, b):
    """Return b with every position that matches a replaced by a dash.

    The two strings are walked in lockstep, so the result is only as long as
    the shorter of the two inputs.
    """
    return ''.join('-' if ref == res else res for ref, res in zip(a, b))

####################################
#    Classes
####################################

# Ordered list of counts.
class OrderedCounter(Counter, OrderedDict):
    """Counter that also inherits OrderedDict's ordered-mapping behaviour."""

####################################
#    Code
####################################

##################
# Colours for print functions.
##################

def cyan(text):
    """Wrap text in the ANSI escape codes that switch the terminal colour to cyan and back."""
    return ''.join(('\033[0;36m', text, '\033[0m'))


def green(text):
    """Wrap text in the ANSI escape codes that switch the terminal colour to green and back."""
    return ''.join(('\033[0;32m', text, '\033[0m'))

##################
# Setup
##################

# Change working directory.
print(green('\nScript started. This will trim UbV sequences so that only diversified regions remain and pair them with their corresponding ELISA results for comparison.') +
      cyan('''\n\nEnter folder location/path where sequences are located:

This will also be the location for the final output.'''))
path = input()
path = path.replace('\\', '/')
# Make the path absolute before changing directory. Every later file path is built as
# path + '/' + name, which would point inside a non-existent nested folder if path were
# still relative to the old working directory. abspath also drops any trailing slash and
# resolves an empty entry to the current working directory. Backslashes are normalised
# a second time because abspath returns the OS-native separator.
path = os.path.abspath(path).replace('\\', '/')
os.chdir(path)

# Logging setup.
# The log file is named after the trailing run of word characters/dots in the folder path.
pathRegex = re.compile(r'([\w.]+)$')
locList = pathRegex.findall(path)
if locList:
    locStr = str(locList[0])
else:
    # The path ends in a character the pattern rejects, or it is a filesystem/drive root.
    # Use the folder name instead, or a fixed name when the path has no final component.
    locStr = os.path.basename(path) or 'phageDisplayUbvTrim'
logging.basicConfig(filename = path + "/" + locStr + ".log",
                    level = logging.INFO,
                    format = '%(asctime)s - %(message)s',
                    filemode = 'w')
logging.info('Working directory changed to %s.' % (path))

# Choose analysed ELISA file.
print(cyan('''\nEnter the analysed ELISA data file name:

Must be in .xlsx format and in the same folder as the previous alignment file. Include the file extension in the name.'''))
elisaFile = input()
elisaFilePath = path + '/' + elisaFile
# Base name used for the output workbook. Only the final extension is removed, so a dotted
# file name such as "plate.1.xlsx" keeps the rest of its name ("plate.1").
elisaFileShort = os.path.splitext(elisaFile)[0]
logging.info('%s chosen as ELISA data source.' % (elisaFilePath))

# Choose amino acid sequence alignment file.
print(cyan('''\nEnter amino acid alignment file name:

Must be in .fasta format. Include the file extension in the name.'''))
seqFile = input()
seqFilePath = path + '/' + seqFile
logging.info('%s chosen as amino acid sequence data source.' % (seqFilePath))

##################
# Data Analysis
##################

# Data extraction.
# A sequence is any run of ten or more capital letters.
seqRegex = re.compile(r'([A-Z]{10,})')
# A stop codon marker (*) and any capital letters that directly follow it are discarded.
stopRegex = re.compile(r'([*]+[A-Z]*)')
# Extract amino acid sequences.
# Header lines (starting with ">") are used only to separate records and never reach the
# pattern matching. Joining header text to the residues would let capital letters at the
# end of a header shift the sequence, and a header containing ten or more capital letters
# would be picked up as a sequence of its own.
fastaRecords = []
with open(seqFilePath, 'r') as inFile:
    for fastaLine in inFile:
        if fastaLine.startswith('>'):
            fastaRecords.append([])
            continue
        if not fastaRecords:
            # Residue lines that appear before any header still form a record.
            fastaRecords.append([])
        fastaRecords[-1].append(fastaLine.rstrip('\n'))
aaList = []
for recordLines in fastaRecords:
    # Wrapped sequence lines are rejoined before the stop codon tail is removed.
    residues = stopRegex.sub('', ''.join(recordLines))
    aaList.extend(seqRegex.findall(residues))
logging.info('Amino acid sequences read from %s.' %  (seqFile))

# Extract statistics and well IDs from ELISA file.
# Need to change the column range so it can be more variable and dependent on alignment length?
allData = pandas.read_excel(elisaFilePath,
                            sheet_name='Unique AA Seq',
                            skiprows = 1,
                            usecols = range(81, 87))
logging.info('%s data read.' % (elisaFile))


def _extractColumn(position, message):
    """Return column `position` of allData as a list, then log `message`."""
    columnValues = allData.iloc[:, position].tolist()
    logging.info(message)
    return columnValues


# The six columns read above are taken in order.
maxList = _extractColumn(0, 'Maximum values extracted.')
minList = _extractColumn(1, 'Minimum values extracted.')
medianList = _extractColumn(2, 'Median values extracted.')
meanList = _extractColumn(3, 'Mean values extracted.')
devList = _extractColumn(4, 'Standard deviation values extracted.')
wellList = _extractColumn(5, 'Wells extracted.')

# For each amino acid sequence, replace non-diversified regions with dashes.
# Drop the first residue of every sequence (the amino acid prior to the start codon).
shortaaList = [fullSeq[1:] for fullSeq in aaList]
logging.info('Initial non-ubiquitin amino acid removed from all sequences.')
# Compare each UbV sequence against the consensus sequence; matching residues become dashes.
consensusSeq = 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGGG'
consensusLen = len(consensusSeq)
conservedList = [compare(consensusSeq, trimmedSeq) for trimmedSeq in shortaaList]
logging.info('List of conserved sequences created.')

# Count the distinct conserved sequences and order them from most to least frequent.
unique = OrderedCounter(conservedList).most_common()
uniqueDict = dict(unique)
logging.info('Dictionary of unique conserved sequences created.')

##################
# Export as .xlsx.
##################

# Create workbook.
workbook = xlsxwriter.Workbook(path + '/' + elisaFileShort + '_conservation.xlsx')
logging.info('Excel spreadsheet created as "%s_conservation.xlsx".' % (elisaFileShort))

# Cell formatting rules.
# Each format is declared in one call through the add_format property dictionary.
# General.
general_format = workbook.add_format({'align': 'center',
                                      'valign': 'vcenter'})
# Titles.
title_format = workbook.add_format({'bold': True,
                                    'font_size': 12,
                                    'align': 'center'})
# Statistics.
stats_format = workbook.add_format({'num_format': '#,##0.0',
                                    'align': 'center',
                                    'valign': 'vcenter'})
# Wells.
wellList_format = workbook.add_format({'font_size': 11})
# Residue numbers.
residue_format = workbook.add_format({'font_size': 10,
                                      'align': 'center'})
# Sequences.
sequence_format = workbook.add_format({'font_size': 10,
                                       'bg_color': 'white',
                                       'align': 'center',
                                       'valign': 'vcenter',
                                       'font_name': 'Lucida Console'})
# Region 1.
region1_format = workbook.add_format({'bg_color': '#BD7191'})
# Region 2.
region2_format = workbook.add_format({'bg_color': '#8FC1C0'})
# Region 3.
region3_format = workbook.add_format({'bg_color': '#DCA16A'})
logging.info('Cell formatting rules are set.')

##################
# 'Conserved AA' worksheet.
##################

# Create worksheet for unique conserved amino acid sequences.
worksheet1 = workbook.add_worksheet('Conserved AA')
worksheet1.hide_gridlines(option = 2)
worksheet1.set_column(1, consensusLen, 2)
worksheet1.set_column(consensusLen + 2, consensusLen + 5, 8)
worksheet1.freeze_panes(0, 1)
logging.info('Conserved AA worksheet created.')
# Assign IDs to each unique amino acid sequence (ID n goes in row n + 2, so data starts at row 3).
worksheet1.write(1, 0, 'ID', title_format)
for idNumber in range(1, len(uniqueDict) + 1):
    worksheet1.write(idNumber + 2, 0, idNumber, general_format)
logging.info('IDs written to Conserved AA worksheet.')
# Write amino acid residue numbers above sequences (residue n goes in column n of row 2).
for residueNumber in range(1, consensusLen + 1):
    worksheet1.write(2, residueNumber, residueNumber, residue_format)
# Write unique conserved sequences to the Conserved AA worksheet, one residue per cell.
worksheet1.write(0, 6, 'Amino Acid Sequence', title_format)
for seqRow, conservedSeq in enumerate(uniqueDict, start=3):
    for seqCol, letter in enumerate(conservedSeq, start=1):
        worksheet1.write(seqRow, seqCol, letter, sequence_format)
logging.info('Unique conserved sequences written to Conserved AA worksheet.')
# Add counts for each unique amino acid sequence in the column after the last residue.
worksheet1.write(1, consensusLen + 1, 'Count', title_format)
for countRow, seqCount in enumerate(uniqueDict.values(), start=3):
    worksheet1.write(countRow, consensusLen + 1, seqCount, general_format)
logging.info('Counts written to Conserved AA worksheet.')

# Write statistics to 'Conserved AA' worksheet.
# One entry per column: (offset from consensusLen, header, values, cell format, log message).
statsLayout = (
    (2, 'Max.', maxList, stats_format, 'Maximum values written to worksheet.'),
    (3, 'Min.', minList, stats_format, 'Minimum values written to worksheet.'),
    (4, 'Median', medianList, stats_format, 'Median values written to worksheet.'),
    (5, 'Mean', meanList, stats_format, 'Mean values written to worksheet.'),
    (6, 'St. Dev.', devList, stats_format, 'Standard deviation values written to worksheet.'),
    (7, 'Wells', wellList, wellList_format, 'Wells written to worksheet.'),
)
for colOffset, header, columnValues, cellFormat, writtenMessage in statsLayout:
    statCol = consensusLen + colOffset
    # Header goes in row 1; values fill downwards from row 3.
    worksheet1.write(1, statCol, header, title_format)
    for statRow, value in enumerate(columnValues, start=3):
        worksheet1.write(statRow, statCol, value, cellFormat)
    logging.info(writtenMessage)

# Conditional formatting for statistics (columns Max. through St. Dev.).
worksheet1.conditional_format(2,
                              consensusLen + 2,
                              len(unique) + 2,
                              consensusLen + 6,
                              {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'})
logging.info('Conditional formatting applied to statistics.')

##################
# Libraries.
##################

print(cyan('''\nChoose which library design to use (type corresponding number in square brackets):

[1] Library 1 (Ernst et al., 2013).
Diversified residues: (Region 1) 2, 4, 6, 8-12, 14, (region 2) 35, 37, 39-40, 42, 44, 46-49, (region 3) 62-64, 66, 68, 70-72.

[2] Library 2 (Ernst et al., 2013).
Diversified residues: (Region 1) 2, 4, 6, 8-12, 14, (region 2) 42, 44, 46-49, (region 3) 62-64, 66, 68, 70-78.

Type any number not included above to skip this step.'''))
# Surrounding whitespace is ignored so that an entry such as "1 " is still recognised.
library = input().strip()

# Library designs. Each region is (title, title column, cell format, column spans), where a
# span is (first column, last column) and both ends are coloured. The column numbers are the
# same as the diversified residue numbers listed in the prompt above.
region1Spans = [(2, 2), (4, 4), (6, 6), (8, 12), (14, 14)]
libraryDesigns = {
    # Library 1 (Ernst et al., 2013).
    '1': [
        ('Region 1', 7, region1_format, region1Spans),
        ('Region 2', 41, region2_format, [(35, 35), (37, 37), (39, 40), (42, 42), (44, 44), (46, 49)]),
        ('Region 3', 70, region3_format, [(62, 64), (66, 66), (68, 68), (70, 72)]),
    ],
    # Library 2 (Ernst et al., 2013).
    '2': [
        ('Region 1', 7, region1_format, region1Spans),
        ('Region 2', 41, region2_format, [(42, 42), (44, 44), (46, 49)]),
        ('Region 3', 70, region3_format, [(62, 64), (66, 66), (68, 68), (70, 78)]),
    ],
}

if library in libraryDesigns:
    # Sequences occupy rows 3 to len(unique) + 2.
    lastSeqRow = len(unique) + 2
    for regionTitle, titleCol, regionFormat, columnSpans in libraryDesigns[library]:
        worksheet1.write(1, titleCol, regionTitle, title_format)
        for firstCol, lastCol in columnSpans:
            # Colour every non-blank cell in the span.
            worksheet1.conditional_format(3, firstCol, lastSeqRow, lastCol,
                                          {'type': 'no_blanks', 'format': regionFormat})
        logging.info('%s coloured.' % (regionTitle))
else:
    print(cyan('\nNumber not recognised, no library design applied.'))

##################
# Final workbook formatting.
##################

# Transform data into proper Excel-formatted tables without any design style applied.
# The table spans the ID column through the Wells column, over the sequence rows only.
worksheet1.add_table(3, 0, len(unique) + 2, consensusLen + 7, {'header_row': False, 'style': None})

# Close .xlsx file.
workbook.close()

# Conclusion.
print(green('\nExcel conserved sequence alignment with ELISA scores saved as %s_conservation.xlsx.' % (elisaFileShort)))
logging.info('Excel file exported as %s_conservation.xlsx.' % (elisaFileShort))
print(green('\nAnalysis finished. See log file for details.'))
# The log names the script as it is called in the file header.
logging.info('phageDisplayUbvTrim.py finished running.')

# Shutdown logging.
logging.shutdown()